readcounts <- read.table("/Users/mar/Documents/GitHub/ANGSD_PROJECT_MOA4020/Project/STAR/featureCounts/Counts.txt", header=TRUE)
names(readcounts) <- gsub(".it2.Aligned.sortedByCoord.out.bam", "", gsub("X.athena.angsd.scratch.moa4020.project.GEO_Dataset.STAR_alignments_it2.alignment_it2_files.\\w+\\.","",names(readcounts)))
str(readcounts)
## 'data.frame': 49997 obs. of 14 variables:
## $ Geneid : chr "TrnP" "TrnT" "CYTB" "TrnE" ...
## $ Chr : chr "chrM" "chrM" "chrM" "chrM" ...
## $ Start : chr "15356" "15289" "14145" "14071" ...
## $ End : chr "15422" "15355" "15288" "14139" ...
## $ Strand : chr "-" "+" "+" "-" ...
## $ Length : int 67 67 1144 69 519 1824 71 59 67 1378 ...
## $ SPP1_KO_1: int 422 6 28865 18 10003 48134 37 141 47 21703 ...
## $ SPP1_KO_2: int 445 1 56344 100 14229 59595 63 66 54 30777 ...
## $ SPP1_KO_3: int 506 1 69231 97 17203 66883 68 41 120 36651 ...
## $ SPP1_KO_4: int 590 0 51625 45 17455 71666 76 132 87 34256 ...
## $ wt_1 : int 436 3 62797 40 13394 50629 62 64 61 32342 ...
## $ wt_2 : int 517 1 78611 42 16649 70607 79 119 103 41027 ...
## $ wt_3 : int 324 3 56111 21 11436 43486 55 54 75 26556 ...
## $ wt_4 : int 360 5 65895 44 13828 50778 59 56 95 31629 ...
Making a new matrix with just the counts and the transcript IDs for the row names
length(unique(readcounts[,1]))
## [1] 49997
length(readcounts[,1])
## [1] 49997
grouped_readcounts <- readcounts[,7:14]
row.names(grouped_readcounts) <- readcounts[,1]
Saving sample info for reference
sample_info <- data.frame(condition = gsub("_[0-9]+", "", colnames(grouped_readcounts)), row.names = colnames(grouped_readcounts))
sample_info
## condition
## SPP1_KO_1 SPP1_KO
## SPP1_KO_2 SPP1_KO
## SPP1_KO_3 SPP1_KO
## SPP1_KO_4 SPP1_KO
## wt_1 wt
## wt_2 wt
## wt_3 wt
## wt_4 wt
str(sample_info)
## 'data.frame': 8 obs. of 1 variable:
## $ condition: chr "SPP1_KO" "SPP1_KO" "SPP1_KO" "SPP1_KO" ...
wd <- getwd()
filenames <- list.files(path = paste0(wd,"/Salmon"),recursive = TRUE,pattern="quant.genes.sf")
filepaths <- list.files(path = paste0(wd,"/Salmon"),recursive = TRUE,pattern="quant.genes.sf",full.names = TRUE)
# Copying transcript names from one file for salmonResultMatrix's row names
salmonTest <- read.csv(file=filepaths[8], sep = "\t")
salmonResultMatrix <- matrix(nrow=nrow(salmonTest),ncol=length(filenames)+1)
row.names(salmonResultMatrix) <- salmonTest[,1]
# iterate through all the quant.sf files and add the count values under each replicate
for(i in 1:length(filepaths)){
salmonResultMatrix[,i+1] <- read.csv(file=filepaths[i], sep = "\t")[,5]
}
# Adding column names for salmonResultMatrix
colnames(salmonResultMatrix) <- c("Length",gsub("/quant.genes.sf","",filenames))
# Copying transcript length data from the test file
salmonResultMatrix[,1] <- salmonTest[,2]
# Copying just the count values to a new matrix - salmonCounts
salmonCounts <- salmonResultMatrix[,-1]
# Removing transcript version details from row names
rownames(salmonCounts) <- sub("\\..*", "", rownames(salmonCounts))
# Saving a reference of the condition for each sample
salmon_sample_info <- data.frame(condition = gsub("_[0-9]+", "", colnames(salmonCounts)), row.names = colnames(salmonCounts))
nrow(salmonCounts)
## [1] 49152
salmon_sample_info
## condition
## SPP1_KO_1 SPP1_KO
## SPP1_KO_2 SPP1_KO
## SPP1_KO_3 SPP1_KO
## SPP1_KO_4 SPP1_KO
## wt_1 wt
## wt_2 wt
## wt_3 wt
## wt_4 wt
str(salmon_sample_info)
## 'data.frame': 8 obs. of 1 variable:
## $ condition: chr "SPP1_KO" "SPP1_KO" "SPP1_KO" "SPP1_KO" ...
Converting the read counts into integers and adding the row and column names
salmonCountData <- matrix(as.integer(ceiling(as.numeric(salmonCounts))), ncol = 8)
row.names(salmonCountData) <- row.names(salmonCounts)
colnames(salmonCountData) <- colnames(salmonCounts)
Creating a DESeq object from both the results
DESeq.ds <- DESeqDataSetFromMatrix(countData = grouped_readcounts,
colData = sample_info,
design = ~ condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
DESeq.ds
## class: DESeqDataSet
## dim: 49997 8
## metadata(1): version
## assays(1): counts
## rownames(49997): TrnP TrnT ... Xkr4 Gm26206
## rowData names(0):
## colnames(8): SPP1_KO_1 SPP1_KO_2 ... wt_3 wt_4
## colData names(1): condition
salmon_DESeq.ds <- DESeqDataSetFromMatrix(countData = salmonCountData,
colData = salmon_sample_info,
design = ~ condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): 11 duplicate rownames
## were renamed by adding numbers
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
salmon_DESeq.ds
## class: DESeqDataSet
## dim: 49152 8
## metadata(1): version
## assays(1): counts
## rownames(49152): LOC100048823 LOC108168686 ... 1700067P10Rik Trem3
## rowData names(0):
## colnames(8): SPP1_KO_1 SPP1_KO_2 ... wt_3 wt_4
## colData names(1): condition
head(counts(DESeq.ds))
## SPP1_KO_1 SPP1_KO_2 SPP1_KO_3 SPP1_KO_4 wt_1 wt_2 wt_3 wt_4
## TrnP 422 445 506 590 436 517 324 360
## TrnT 6 1 1 0 3 1 3 5
## CYTB 28865 56344 69231 51625 62797 78611 56111 65895
## TrnE 18 100 97 45 40 42 21 44
## ND6 10003 14229 17203 17455 13394 16649 11436 13828
## ND5 48134 59595 66883 71666 50629 70607 43486 50778
colSums(counts(DESeq.ds))
## SPP1_KO_1 SPP1_KO_2 SPP1_KO_3 SPP1_KO_4 wt_1 wt_2 wt_3 wt_4
## 15330724 20682764 23315712 19236624 23547362 26726542 20706864 24887319
par(las=2)
colSums(counts(DESeq.ds)) %>% barplot(main = "STAR")
dim(DESeq.ds)
## [1] 49997 8
head(counts(salmon_DESeq.ds))
## SPP1_KO_1 SPP1_KO_2 SPP1_KO_3 SPP1_KO_4 wt_1 wt_2 wt_3 wt_4
## LOC100048823 0 0 0 0 0 0 0 0
## LOC108168686 0 0 3 0 3 0 0 0
## LOC100861738 0 0 0 5 0 0 0 0
## LOC100861749 0 0 2 0 3 0 2 0
## LOC100043931 0 0 0 0 0 0 0 0
## LOC673652 0 0 0 0 0 0 0 0
colSums(counts(salmon_DESeq.ds))
## SPP1_KO_1 SPP1_KO_2 SPP1_KO_3 SPP1_KO_4 wt_1 wt_2 wt_3 wt_4
## 15950982 21386879 24139399 19968988 24419887 27858579 21506463 25828594
par(las=2)
colSums(counts(salmon_DESeq.ds)) %>% barplot(main = "Salmon")
dim(salmon_DESeq.ds)
## [1] 49152 8
Removing genes with no counts in all the samples:
keep_genes <- rowSums(counts(DESeq.ds)) > 0
DESeq.ds <- DESeq.ds[ keep_genes, ]
dim(DESeq.ds)
## [1] 29710 8
counts(DESeq.ds) %>% str
## int [1:29710, 1:8] 422 6 28865 18 10003 48134 37 141 47 21703 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:29710] "TrnP" "TrnT" "CYTB" "TrnE" ...
## ..$ : chr [1:8] "SPP1_KO_1" "SPP1_KO_2" "SPP1_KO_3" "SPP1_KO_4" ...
assay(DESeq.ds) %>% str
## int [1:29710, 1:8] 422 6 28865 18 10003 48134 37 141 47 21703 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:29710] "TrnP" "TrnT" "CYTB" "TrnE" ...
## ..$ : chr [1:8] "SPP1_KO_1" "SPP1_KO_2" "SPP1_KO_3" "SPP1_KO_4" ...
salmon_keep_genes <- rowSums(counts(salmon_DESeq.ds)) > 0
salmon_DESeq.ds <- salmon_DESeq.ds[ salmon_keep_genes, ]
dim(salmon_DESeq.ds)
## [1] 26913 8
counts(salmon_DESeq.ds) %>% str
## int [1:26913, 1:8] 0 0 0 3 476 0 0 362 0 0 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:26913] "LOC108168686" "LOC100861738" "LOC100861749" "Spry3" ...
## ..$ : chr [1:8] "SPP1_KO_1" "SPP1_KO_2" "SPP1_KO_3" "SPP1_KO_4" ...
assay(salmon_DESeq.ds) %>% str
## int [1:26913, 1:8] 0 0 0 3 476 0 0 362 0 0 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:26913] "LOC108168686" "LOC100861738" "LOC100861749" "Spry3" ...
## ..$ : chr [1:8] "SPP1_KO_1" "SPP1_KO_2" "SPP1_KO_3" "SPP1_KO_4" ...
DESeq.ds <- estimateSizeFactors(DESeq.ds) # calculate SFs, add them to object
plot( sizeFactors(DESeq.ds), colSums(counts(DESeq.ds)), # assess them
ylab = "library sizes", xlab = "size factors", cex = .6 , main="STAR")
salmon_DESeq.ds <- estimateSizeFactors(salmon_DESeq.ds) # calculate SFs, add them to object
plot(sizeFactors(salmon_DESeq.ds), colSums(counts(salmon_DESeq.ds)), # assess them
ylab = "library sizes", xlab = "size factors", cex = .6, main="Salmon")
## bp of non-normalized
boxplot(log2(counts(DESeq.ds) +1), notch=TRUE,
main = "STAR Non-normalized read counts",
ylab ="log2(read counts)", cex = .6)
## bp of size-factor normalized values
boxplot(log2(counts(DESeq.ds, normalized=TRUE)+1), notch=TRUE,
main = "STAR Size-factor-normalized read counts",
ylab ="log2(read counts)", cex = .6)
## bp of non-normalized
boxplot(log2(counts(salmon_DESeq.ds)+1), notch=TRUE,
main = "Salmon Non-normalized read counts",
ylab ="log2(read counts)", cex = .6)
## bp of size-factor normalized values
boxplot(log2(counts(salmon_DESeq.ds, normalized=TRUE)+1), notch=TRUE,
main = "Salmon Size-factor-normalized read counts",
ylab ="log2(read counts)", cex = .6)
## non-normalized read counts plus pseudocount
log.counts <- log2(counts(DESeq.ds, normalized = FALSE) + 1)
## instead of creating a new object, we could assign the values to a distinct matrix
## within the DESeq.ds object
assay(DESeq.ds, "log.counts") <- log2(counts(DESeq.ds, normalized = FALSE) + 1)
## normalized read counts
assay(DESeq.ds, "log.norm.counts") <- log2(counts(DESeq.ds, normalized=TRUE) + 1)
DESeq.ds[, c("wt_1","wt_2")] %>%
assay(., "log.norm.counts") %>%
plot(., cex=.1, main = "wt_1 vs. wt_2")
DESeq.ds[, c("SPP1_KO_1","SPP1_KO_2")] %>%
assay(., "log.norm.counts") %>%
plot(., cex=.1, main = "SPP1_KO_1 vs SPP1_KO_2")
## non-normalized read counts plus pseudocount
log.counts <- log2(counts(salmon_DESeq.ds, normalized = FALSE) + 1)
## instead of creating a new object, we could assign the values to a distinct matrix
## within the DESeq.ds object
assay(salmon_DESeq.ds, "log.counts") <- log2(counts(salmon_DESeq.ds, normalized = FALSE) + 1)
## normalized read counts
assay(salmon_DESeq.ds, "log.norm.counts") <- log2(counts(salmon_DESeq.ds, normalized=TRUE) + 1)
salmon_DESeq.ds[, c("wt_1","wt_2")] %>%
assay(., "log.norm.counts") %>%
plot(., cex=.1, main = "wt_1 vs. wt_2")
salmon_DESeq.ds[, c("SPP1_KO_1","SPP1_KO_2")] %>%
assay(., "log.norm.counts") %>%
plot(., cex=.1, main = "SPP1_KO_1 vs SPP1_KO_2")
## generate the plot
msd_plot <- vsn::meanSdPlot(assay(DESeq.ds, "log.norm.counts"),
ranks=FALSE, # show the data on the original scale
plot = FALSE)
msd_plot$gg +
ggtitle("Sequencing depth normalized log2(read counts)") +
ylab("standard deviation")
## generate the plot
msd_plot <- vsn::meanSdPlot(assay(salmon_DESeq.ds, "log.norm.counts"),
ranks=FALSE, # show the data on the original scale
plot = FALSE)
msd_plot$gg +
ggtitle("Sequencing depth normalized log2(read counts)") +
ylab("standard deviation")
DESeq.rlog <- rlog(DESeq.ds, blind = TRUE)
par(mfrow=c(1,2))
plot(assay(DESeq.ds, "log.norm.counts")[,1:2], cex=.1,
main = "size factor and log2-transformed")
## the rlog-transformed counts are stored in the "assay" accessor
plot(assay(DESeq.rlog)[,1:2],
cex=.1, main = "rlog transformed",
xlab = colnames(assay(DESeq.rlog[,1])),
ylab = colnames(assay(DESeq.rlog[,2])) )
salmon_DESeq.rlog <- rlog(salmon_DESeq.ds, blind = TRUE)
par(mfrow=c(1,2))
plot(assay(salmon_DESeq.ds, "log.norm.counts")[,1:2], cex=.1,
main = "size factor and log2-transformed")
## the rlog-transformed counts are stored in the "assay" accessor
plot(assay(salmon_DESeq.rlog)[,1:2],
cex=.1, main = "rlog transformed",
xlab = colnames(assay(salmon_DESeq.rlog[,1])),
ylab = colnames(assay(salmon_DESeq.rlog[,2])) )
rlog.norm.counts <- assay(DESeq.rlog)
## rlog-transformed read counts
msd_plot <- vsn::meanSdPlot(assay(DESeq.rlog), ranks=FALSE, plot = FALSE)
msd_plot$gg + ggtitle("Following rlog transformation") +
coord_cartesian(ylim = c(0,3))
salmon_rlog.norm.counts <- assay(salmon_DESeq.rlog)
## rlog-transformed read counts
salmon_msd_plot <- vsn::meanSdPlot(assay(salmon_DESeq.rlog), ranks=FALSE, plot = FALSE)
salmon_msd_plot$gg + ggtitle("Following rlog transformation") +
coord_cartesian(ylim = c(0,3))
save.image(file = "DESeqReadyFiles.RData")
FILE_DSD="DESeqReadyFiles.RData"
load(FILE_DSD)
DESeq.ds
## class: DESeqDataSet
## dim: 29710 8
## metadata(1): version
## assays(3): counts log.counts log.norm.counts
## rownames(29710): TrnP TrnT ... Gm7341 Xkr4
## rowData names(0):
## colnames(8): SPP1_KO_1 SPP1_KO_2 ... wt_3 wt_4
## colData names(2): condition sizeFactor
DESeq.ds$condition
## [1] SPP1_KO SPP1_KO SPP1_KO SPP1_KO wt wt wt wt
## Levels: SPP1_KO wt
salmon_DESeq.ds
## class: DESeqDataSet
## dim: 26913 8
## metadata(1): version
## assays(3): counts log.counts log.norm.counts
## rownames(26913): LOC108168686 LOC100861738 ... 4921531C22Rik Trem3
## rowData names(0):
## colnames(8): SPP1_KO_1 SPP1_KO_2 ... wt_3 wt_4
## colData names(2): condition sizeFactor
salmon_DESeq.ds$condition
## [1] SPP1_KO SPP1_KO SPP1_KO SPP1_KO wt wt wt wt
## Levels: SPP1_KO wt
DESeq.ds$condition %<>% relevel(ref="wt")
DESeq.ds$condition
## [1] SPP1_KO SPP1_KO SPP1_KO SPP1_KO wt wt wt wt
## Levels: wt SPP1_KO
salmon_DESeq.ds$condition %<>% relevel(ref="wt")
salmon_DESeq.ds$condition
## [1] SPP1_KO SPP1_KO SPP1_KO SPP1_KO wt wt wt wt
## Levels: wt SPP1_KO
design(DESeq.ds)
## ~condition
DESeq.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
DESeq.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
DESeq.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene
DESeq.ds %<>% nbinomWaldTest()
DESeq.ds
## class: DESeqDataSet
## dim: 29710 8
## metadata(1): version
## assays(6): counts log.counts ... H cooks
## rownames(29710): TrnP TrnT ... Gm7341 Xkr4
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(8): SPP1_KO_1 SPP1_KO_2 ... wt_3 wt_4
## colData names(2): condition sizeFactor
design(salmon_DESeq.ds)
## ~condition
salmon_DESeq.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
salmon_DESeq.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
salmon_DESeq.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene
salmon_DESeq.ds %<>% nbinomWaldTest()
salmon_DESeq.ds
## class: DESeqDataSet
## dim: 26913 8
## metadata(1): version
## assays(6): counts log.counts ... H cooks
## rownames(26913): LOC108168686 LOC100861738 ... 4921531C22Rik Trem3
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(8): SPP1_KO_1 SPP1_KO_2 ... wt_3 wt_4
## colData names(2): condition sizeFactor
rowData(DESeq.ds) %>% colnames
## [1] "baseMean"
## [2] "baseVar"
## [3] "allZero"
## [4] "dispGeneEst"
## [5] "dispGeneIter"
## [6] "dispFit"
## [7] "dispersion"
## [8] "dispIter"
## [9] "dispOutlier"
## [10] "dispMAP"
## [11] "Intercept"
## [12] "condition_SPP1_KO_vs_wt"
## [13] "SE_Intercept"
## [14] "SE_condition_SPP1_KO_vs_wt"
## [15] "WaldStatistic_Intercept"
## [16] "WaldStatistic_condition_SPP1_KO_vs_wt"
## [17] "WaldPvalue_Intercept"
## [18] "WaldPvalue_condition_SPP1_KO_vs_wt"
## [19] "betaConv"
## [20] "betaIter"
## [21] "deviance"
## [22] "maxCooks"
rowData(DESeq.ds)$WaldPvalue_condition_SPP1_KO_vs_wt %>% hist(breaks=19, main="Raw p-values for SPP1_KO vs wt")
rowData(salmon_DESeq.ds) %>% colnames
## [1] "baseMean"
## [2] "baseVar"
## [3] "allZero"
## [4] "dispGeneEst"
## [5] "dispGeneIter"
## [6] "dispFit"
## [7] "dispersion"
## [8] "dispIter"
## [9] "dispOutlier"
## [10] "dispMAP"
## [11] "Intercept"
## [12] "condition_SPP1_KO_vs_wt"
## [13] "SE_Intercept"
## [14] "SE_condition_SPP1_KO_vs_wt"
## [15] "WaldStatistic_Intercept"
## [16] "WaldStatistic_condition_SPP1_KO_vs_wt"
## [17] "WaldPvalue_Intercept"
## [18] "WaldPvalue_condition_SPP1_KO_vs_wt"
## [19] "betaConv"
## [20] "betaIter"
## [21] "deviance"
## [22] "maxCooks"
rowData(salmon_DESeq.ds)$WaldPvalue_condition_SPP1_KO_vs_wt %>% hist(breaks=19, main="Raw p-values for SPP1_KO vs wt")
DGE.results <- results(DESeq.ds, independentFiltering = TRUE, alpha = 0.05)
# the first line will tell you which comparison was done to achieve the log2FC
head(DGE.results)
## log2 fold change (MLE): condition SPP1 KO vs wt
## Wald test p-value: condition SPP1 KO vs wt
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## TrnP 452.39676 0.5955603 0.166828 3.5698978 3.57121e-04 0.001950977
## TrnT 2.64012 -0.1183985 1.296677 -0.0913092 9.27247e-01 0.963156134
## CYTB 56565.35021 -0.0764638 0.135832 -0.5629292 5.73483e-01 0.722699399
## TrnE 49.54153 1.0708467 0.417314 2.5660481 1.02865e-02 0.035636123
## ND6 14106.62831 0.3970080 0.120152 3.3042057 9.52459e-04 0.004619597
## ND5 57548.84414 0.5173162 0.128589 4.0230240 5.74556e-05 0.000379555
salmon_DGE.results <- results(salmon_DESeq.ds, independentFiltering = TRUE, alpha = 0.05)
# the first line will tell you which comparison was done to achieve the log2FC
head(salmon_DGE.results)
## log2 fold change (MLE): condition SPP1 KO vs wt
## Wald test p-value: condition SPP1 KO vs wt
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## LOC108168686 0.676856 0.0934023 3.359614 0.0278015 0.977820474
## LOC100861738 0.703059 3.1083170 3.444056 0.9025165 0.366782552
## LOC100861749 0.814896 -1.1609272 2.713580 -0.4278212 0.668781280
## Spry3 30.536510 -1.7286235 0.493379 -3.5036419 0.000458942
## Vamp7 729.855253 -0.0455607 0.114357 -0.3984069 0.690330288
## LOC105243599 0.488945 -1.0807886 3.121793 -0.3462077 0.729186621
## padj
## <numeric>
## LOC108168686 NA
## LOC100861738 NA
## LOC100861749 NA
## Spry3 0.00240699
## Vamp7 0.80986940
## LOC105243599 NA
summary(DGE.results)
##
## out of 29710 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 3344, 11%
## LFC < 0 (down) : 3053, 10%
## outliers [1] : 2, 0.0067%
## low counts [2] : 9216, 31%
## (mean count < 2)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
summary(salmon_DGE.results)
##
## out of 26913 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 3296, 12%
## LFC < 0 (down) : 3058, 11%
## outliers [1] : 22, 0.082%
## low counts [2] : 6783, 25%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# the DESeqResult object can basically be handled like a data.frame
table(DGE.results$padj < 0.05)
##
## FALSE TRUE
## 14095 6397
table(salmon_DGE.results$padj < 0.05)
##
## FALSE TRUE
## 13754 6354
DGE.results$padj %>%
hist(breaks=19, main="STAR adjusted p-values for SPP1_KO vs WT")
salmon_DGE.results$padj %>%
hist(breaks=19, main="Salmon adjusted p-values for SPP1_KO vs WT")
DGE.results.sorted <- DGE.results %>% `[`(order(.$padj),)
head(DGE.results.sorted)
## log2 fold change (MLE): condition SPP1 KO vs wt
## Wald test p-value: condition SPP1 KO vs wt
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## Spp1 14917.826 -7.11281 0.1611036 -44.1505 0.00000e+00 0.00000e+00
## Mrc1 1130.756 4.33284 0.1201555 36.0603 9.52727e-285 9.76164e-281
## Cd93 1779.027 4.93173 0.1484807 33.2146 6.61766e-242 4.52031e-238
## Xdh 967.098 3.70743 0.1148878 32.2700 1.84404e-228 9.44704e-225
## Pld4 1303.429 3.32855 0.1110480 29.9739 2.14610e-197 8.79558e-194
## Pik3ap1 1679.039 2.76266 0.0937521 29.4677 7.46436e-191 2.54933e-187
salmon_DGE.results.sorted <- salmon_DGE.results %>% `[`(order(.$padj),)
head(salmon_DGE.results.sorted)
## log2 fold change (MLE): condition SPP1 KO vs wt
## Wald test p-value: condition SPP1 KO vs wt
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## Mrc1 1134.00 4.37009 0.1158709 37.7152 0.00000e+00 0.00000e+00
## Spp1 15131.58 -6.29562 0.1065286 -59.0979 0.00000e+00 0.00000e+00
## Cd93 1760.21 4.95606 0.1494439 33.1634 3.63555e-241 2.43679e-237
## Xdh 965.02 3.70092 0.1128660 32.7904 8.06465e-236 4.05410e-232
## Pik3ap1 1723.48 2.77874 0.0907901 30.6062 1.01213e-205 4.07037e-202
## Pld4 1274.49 3.34293 0.1111628 30.0724 1.11255e-198 3.72854e-195
par(mfrow=c(1,2))
plotCounts(DESeq.ds, gene="Mrc1", normalized = TRUE, xlab="")
plotCounts(DESeq.ds, gene = which.max(DGE.results$padj), xlab="",
main = "Gene with max. p.adj.\n(=least significant)")
par(mfrow=c(1,2))
plotCounts(salmon_DESeq.ds, gene="Mrc1", normalized = TRUE, xlab="")
plotCounts(salmon_DESeq.ds, gene = which.max(salmon_DGE.results$padj), xlab="",
main = "Gene with max. p.adj.\n(=least significant)")
# identify genes with the desired adjusted p-value cut-off
DGEgenes <- rownames(subset(DGE.results.sorted, padj < 0.05))
# extract rlog-transformed values into a matrix
rlog.dge <- DESeq.rlog[DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(rlog.dge, scale="none",
show_rownames=FALSE, main="STAR DGE (no scaling)",
color=colorRampPalette(RColorBrewer::brewer.pal(n=7, name="Reds"))(100))
# identify genes with the desired adjusted p-value cut-off
salmon_DGEgenes <- rownames(subset(salmon_DGE.results.sorted, padj < 0.05))
# extract rlog-transformed values into a matrix
salmon_rlog.dge <- salmon_DESeq.rlog[salmon_DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(salmon_rlog.dge, scale="none",
show_rownames=FALSE, main="Salmon DGE (no scaling)",
color=colorRampPalette(RColorBrewer::brewer.pal(n=7, name="Reds"))(100))
pheatmap(rlog.dge, scale="row",
show_rownames=FALSE, main="STAR DGE (row-based z-score)")
pheatmap(salmon_rlog.dge, scale="row",
show_rownames=FALSE, main="Salmon DGE (row-based z-score)")
DESeq2::plotMA(DGE.results, alpha=0.05,
main="STAR Test: p.adj.value < 0.05", ylim = c(-4,4))
DESeq2::plotMA(salmon_DGE.results, alpha=0.05,
main="Salmon Test: p.adj.value < 0.05", ylim = c(-4,4))
vp1.1 <- EnhancedVolcano(DGE.results,
lab=rownames(DGE.results),
x='log2FoldChange', y='padj',
pCutoff=0.05,
title="STAR SPP1_KO / wt")
## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest
## non-zero p-value...
print(vp1.1)
vp1.2 <- EnhancedVolcano(salmon_DGE.results,
lab=rownames(salmon_DGE.results),
x='log2FoldChange', y='padj',
pCutoff=0.05,
title="Salmon SPP1_KO / wt")
## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest
## non-zero p-value...
print(vp1.2)
DGE.results.shrnk <- lfcShrink(DESeq.ds,
coef=2,
type="apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
resultsNames(DESeq.ds)
## [1] "Intercept" "condition_SPP1_KO_vs_wt"
salmon_DGE.results.shrnk <- lfcShrink(salmon_DESeq.ds,
coef=2,
type="apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
resultsNames(salmon_DESeq.ds)
## [1] "Intercept" "condition_SPP1_KO_vs_wt"
par(mfrow = c(1,2))
DESeq2::plotMA(DGE.results, alpha=0.05,
main="no shrinkage", ylim=c(-4,4))
DESeq2::plotMA(DGE.results.shrnk, alpha=0.05,
main="with logFC shrinkage", ylim=c(-4,4))
par(mfrow = c(1,2))
DESeq2::plotMA(salmon_DGE.results, alpha=0.05,
main="no shrinkage", ylim=c(-4,4))
DESeq2::plotMA(salmon_DGE.results.shrnk, alpha=0.05,
main="with logFC shrinkage", ylim=c(-4,4))
vp2.1 <- EnhancedVolcano(DGE.results.shrnk, lab=rownames(DGE.results.shrnk),
x='log2FoldChange', y='padj', pCutoff = 0.05,
title="with logFC shrinkage")
## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest
## non-zero p-value...
vp1.1 + vp2.1
vp2.2 <- EnhancedVolcano(salmon_DGE.results.shrnk, lab=rownames(salmon_DGE.results.shrnk),
x='log2FoldChange', y='padj', pCutoff = 0.05,
title="with logFC shrinkage")
## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest
## non-zero p-value...
vp1.2 + vp2.2
DGE.results %>% `[`(order(.$padj),) %>% head
## log2 fold change (MLE): condition SPP1 KO vs wt
## Wald test p-value: condition SPP1 KO vs wt
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## Spp1 14917.826 -7.11281 0.1611036 -44.1505 0.00000e+00 0.00000e+00
## Mrc1 1130.756 4.33284 0.1201555 36.0603 9.52727e-285 9.76164e-281
## Cd93 1779.027 4.93173 0.1484807 33.2146 6.61766e-242 4.52031e-238
## Xdh 967.098 3.70743 0.1148878 32.2700 1.84404e-228 9.44704e-225
## Pld4 1303.429 3.32855 0.1110480 29.9739 2.14610e-197 8.79558e-194
## Pik3ap1 1679.039 2.76266 0.0937521 29.4677 7.46436e-191 2.54933e-187
salmon_DGE.results %>% `[`(order(.$padj),) %>% head
## log2 fold change (MLE): condition SPP1 KO vs wt
## Wald test p-value: condition SPP1 KO vs wt
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## Mrc1 1134.00 4.37009 0.1158709 37.7152 0.00000e+00 0.00000e+00
## Spp1 15131.58 -6.29562 0.1065286 -59.0979 0.00000e+00 0.00000e+00
## Cd93 1760.21 4.95606 0.1494439 33.1634 3.63555e-241 2.43679e-237
## Xdh 965.02 3.70092 0.1128660 32.7904 8.06465e-236 4.05410e-232
## Pik3ap1 1723.48 2.77874 0.0907901 30.6062 1.01213e-205 4.07037e-202
## Pld4 1274.49 3.34293 0.1111628 30.0724 1.11255e-198 3.72854e-195
STAR_gene_name <- row.names(subset(DGE.results, padj < 0.05))
write.table(cbind(STAR_gene_name,subset(DGE.results, padj < 0.05)),
file="STAR_DESeq2results_wt-vs-SPP1_KO.txt",
sep="\t", quote=FALSE, row.names=FALSE)
salmon_gene_name <- row.names(subset(salmon_DGE.results, padj < 0.05))
write.table(cbind(salmon_gene_name,subset(salmon_DGE.results, padj < 0.05)),
file="Salmon_DESeq2results_wt-vs-SPP1_KO.txt",
sep="\t", quote=FALSE, row.names=FALSE)